# Setup file locations to import
beer_csv_loc <- "./StartingDBs/Beers.csv"
breweries_csv_loc <- "./StartingDBs/Breweries.csv"
# Read in data
beer_data <- read.csv(beer_csv_loc, header = TRUE)
brewery_data <- read.csv(breweries_csv_loc, header = TRUE)
# Create a seperate data frame to store state data
### Note: This data set counts the district of columbia as a state
num_of_breweries_by_state <- data.frame(table(brewery_data$State))
#the last six observations to check the merged file
# Change column name of beer_data to match brewery_data to use as a primary key
colnames(beer_data)[5] <- "Brew_ID"
# Merge two data bases using merge
full_brew_data <- merge(brewery_data, beer_data, by="Brew_ID")
# Rename columns 2 and 5 that were changed during the merge
colnames(full_brew_data)[2] <- "Brewery"
colnames(full_brew_data)[5] <- "Beer_Name"
# Print first 6 and last 6 observations
head(full_brew_data, n=6)
## Brew_ID Brewery City State Beer_Name Beer_ID ABV IBU
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060 38
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060 25
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056 47
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045 50
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049 26
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048 19
## Style Ounces
## 1 Pumpkin Ale 16
## 2 American Porter 16
## 3 Extra Special / Strong Bitter (ESB) 16
## 4 American IPA 16
## 5 Milk / Sweet Stout 16
## 6 English Brown Ale 16
tail(full_brew_data, n=6)
## Brew_ID Brewery City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## Beer_Name Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
# Create two new tables where one has only ABV with no <NA>'s and the other
state_abv <- data.frame(State=full_brew_data$State, ABV=full_brew_data$ABV)
state_abv <- state_abv %>% drop_na(ABV)
# Do the same for IBU
state_ibu <- data.frame(State=full_brew_data$State, IBU=full_brew_data$IBU)
state_ibu <- state_ibu %>% drop_na(IBU)
##state. Plot a bar chart
# Find the average ABV for the state_abv table
avg_state_abv <- ddply(state_abv, .(State), function(x) mean(x$ABV))
colnames(avg_state_abv)[2] <- "Average_ABV" # Rename the column
# Drop DC. It's not a state....yet
avg_state_abv <- avg_state_abv[-c(8),] # DC is equal to position 8 on the table
# Order the states by Decending ABV content
avg_state_abv <- avg_state_abv[order(avg_state_abv$Average_ABV, decreasing=TRUE),]
# Find Average IBU
avg_state_ibu <- ddply(state_ibu, .(State), function(x) mean(x$IBU))
colnames(avg_state_ibu)[2] <- "Average_IBU" # Rename column
# Order the states by IBU Decending
avg_state_ibu <- avg_state_ibu[order(avg_state_ibu$Average_IBU, decreasing=TRUE),]
# Plot Average Alcohol by Volume per state
ggplot(avg_state_abv, aes(x=reorder(State, desc(Average_ABV)), y=Average_ABV, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
xlab("State") +
ylab("Average Alcohol by Volume")
# Plot Average International Bitterness Unit per State
ggplot(avg_state_ibu, aes(x=reorder(State, desc(Average_IBU)), y=Average_IBU, fill=State)) +
geom_col(show.legend = FALSE, width=.9, position="dodge") +
xlab("State") +
ylab("Average IBU")
# Get the State with the highest average ABV
state_max_abv <- avg_state_abv[which.max(avg_state_abv$Average_ABV),]
# Get the State with the highest average IBU
state_max_ibu <- avg_state_ibu[which.max(avg_state_ibu$Average_IBU),]
#and other types of Ale. Use KNN.
# Create a data frame with only IPA's
#df1[grep("dog",df1$type),]
ipa_beers <- beer_data_no_na[grep("IPA",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of IPA for easier model training
ipa_beers$Style <- "IPA"
ales_beer <- beer_data_no_na[grep("Ale",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of Ale for easier model training
ales_beer$Style <- "Ale"
# Combine the two data sets
ipas_and_ales <- rbind(ipa_beers, ales_beer)
# plot the data to get an initial view of the data
ggplot(ipas_and_ales, aes(x=ABV, y=IBU, colour=Style)) + geom_point()
# Create train and test data frames
ipas_and_ales_train <- ipas_and_ales # train data needs to be a duplicate of orig
ipas_and_ales_test <- data.frame()
# Split 70/30 (Train/Test)
samples_to_take <- round(as.numeric(nrow(ipas_and_ales_train)*.3), digits=0)
for (row in 1:samples_to_take) {
tmp_row <- sample_n(ipas_and_ales_train, 1) # Take sample
# Add Sample to test data
ipas_and_ales_test <- ipas_and_ales_test %>% rbind(tmp_row)
# Remove Sample from train data
ipas_and_ales_train <- ipas_and_ales_train[!(ipas_and_ales_train$Beer_ID==tmp_row$Beer_ID),]
}
# find an optimal K value for the KNN model
style_accuracy <- data.frame(accuract=numeric(50), k=numeric(50))
for (iter in 1:50) {
style_class <- knn(ipas_and_ales_train[,c(3,4)], ipas_and_ales_test[c(3,4)],
ipas_and_ales_train$Style,
prob=TRUE, k=iter)
table(ipas_and_ales_test$Style, style_class)
cm <- confusionMatrix(table(ipas_and_ales_test$Style, style_class))
style_accuracy$accuracy[iter] <- cm$overall[1]
style_accuracy$k[iter] <- iter
}
#plot(style_accuracy$k, style_accuracy$accuracy, type="l", xlab="k")
plot_ly(style_accuracy, x=style_accuracy$k, y=style_accuracy$accuracy,
type="scatter", mode="lines")
##
## On average this loop gives me an optimal k level of k=5,6,7
##
6. Comment on the Summary stats and distribution of the ABV variable
##7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content?